#! /usr/bin/env perl

# This script processes time zone definitions from the Olson database
# and transforms them into the format used by the erlang_localtime
# library.
#
# Some helpful URLs:
# https://www.iana.org/time-zones
# https://github.com/dmitryme/erlang_localtime
# http://www.cstdbill.com/tzdb/tz-how-to.html

# Known bugs (should fix):
# * Discarding past/future Rules considers only year.  Better to
#   use a window [now, one-year-from-now) so that our output rules
#   are valid for at least a year.
# Known bugs (can't fix without upstream changes):
# * Africa/Casablanca: Has more than one DST transition per year.
# * Pacific/Fiji: DST does not start/end on Nth DayOfWeek in month.
# * America/Godthab: Transition time as UTC moves into previous day.

use strict;
use warnings;

use Getopt::Long;
use Time::Local qw( timelocal timelocal_nocheck );
use Data::Dump;

use constant DPW => 7;		# Days per week.
use constant HPD => 24;		# Hours per day.
use constant MPH => 60;		# Minutes per hour.
use constant SPECIFICITY_MAX => 100;

use constant {
  RULE_DATE		=> 0,
  RULE_SAVED		=> 1,
  RULE_TIME		=> 2,
  RULE_LETTERS		=> 3,
  RULE_SPECIFICITY	=> 4,
};
use constant RULE_NULL => [ undef, 0, '0W', '-', 0 ];

my $version = undef;
my $output_file;
my $date;

GetOptions(
  'version=s'	=> \$version,
  'output=s'	=> \$output_file,
  'date=s'	=> \$date,
) or die;

my @data = @ARGV;

my ($current_day, $current_month, $current_year) = do {
  if (defined $date) {
    $date =~ m/^(\d\d\d\d)-?(\d\d)-?(\d\d)$/ or die "parse date \"$date\" as YYYY-MM-DD failed\n";
    ($3, $2 - 1, $1);
  } else {
    my @lt = localtime(time);
    ($lt[3], $lt[4], $lt[5] + 1900);
  }
};

my %rule;
my %rule_base;
my %output;
my @problem;

# Add the null rule.
$rule_base{'-'} = [ 0, RULE_NULL ];

# Have to make three passes through the file.
# First pass: process Rule lines.
process_data(\@data, { Rule => \&rule_line });
# Second pass: process Zone lines.
process_data(\@data, { Zone => \&zone_line });
# Third pass, process Link lines.
process_data(\@data, { Link => \&link_line });

my $ofh;
if (defined $output_file) {
  open($ofh, '>', $output_file) or die "open $output_file for write failed: $!\n";
} else {
  $output_file = 'STDOUT';
  open($ofh, '>&STDOUT') or die "dup $output_file failed: $!\n";
}

printf $ofh "      %%%% Automatically generated from the time zone database%s for %04d-%02d-%02d.\n",
  (defined $version ? " version $version" : ''),
  $current_year, $current_month + 1, $current_day;

print $ofh "\n";
print $ofh "      %% Problems:\n";
print $ofh map("      %%   $_\n", @problem);
print $ofh "\n";

print $ofh join(",\n", map {
  (my $o = $output{$_}) =~ s/#ZONE#/$_/;
  "      $o";
} (sort keys %output)),
  "\n";

close($ofh) or die "close $output_file failed: $!\n";

exit(0);

########################################################################

sub process_data {
  my ($data, $handler) = @_;

  foreach my $f (@$data) {
    my $fh = $f;
    {
      # We open the file this way so that die and warn print the filename.
      no strict 'refs';
      open($fh, '<', $fh) or die "open $fh for read failed: $!";
    }

    # Read the Olson database.
    my ($last_linetype, $last_lineprefix);
    while (my $l = <$fh>) {
      chomp($l);
      $l =~ s/\s*#.*//;		# Remove comments.
      $l =~ s/\s+$//;		# Remove trailing whitespace.
      $l eq '' and next;

      my $linetype = do {
	if ($l =~ m/^((\S+)\s+\S+)/) {
	  # Continuation line.
	  $last_linetype = $2;
	  $last_lineprefix = $1;
	} else {
	  $l = $last_lineprefix . $l;
	}
	$last_linetype;
      };

      if (defined(my $h = $handler->{$linetype})) {
	$h->($l);
      }
    }

    close($fh);
  }
}

# Converts an offset in the format "[+-]?HH:MM" or "[+-]?HH" into minutes.
# For example, "2:00" -> 120, "-0:30" -> -30, "+5" -> 300.
sub offset_minutes {
  my ($off, $adj) = @_;

  my $convert_offset = sub {
    $_[0] =~ m/^([\+\-]?)(\d+)(?::(\d+))?$/
      or die "offset \"$_[0]\" did not match";
    my $m = $2 * MPH;
    if (defined $3) { $m += $3; }
    if ($1 eq '-') { $m = -$m; }
    return $m;
  };

  my $offset = $convert_offset->($off);
  if (defined $adj) {
    $offset -= $convert_offset->($adj);
  }

  return $offset;
}


sub rule_line {
  my ($l) = @_;
  my ($RULE, $name, $from, $to, $type, $in, $on, $at, $save, $letters) = split(m/\s+/, $l);

  # The rule lines in the time zone database describe transitions.
  #
  # Cases we have to handle:
  #
  # 1. There are no rules still active.  We need to determine the last
  # transition and apply it statically.  (Example: Ghana.)
  #
  # 2. There are active rules.  We should determine the current rules
  # (ignore old rules and future rules) and transfer them to the
  # output rules.  (Example: Morocco.)

  my $save_minutes = offset_minutes($save);

  # The rules that this function generates has these parts.
  # These are accessed using the RULE_* constants defined above.
  # 1. A description of the date on which the transition happens,
  #    or undef if this is a base rule.
  # 2. The number of minutes "saved" (the difference from the base
  #    offset for the zone).
  # 3. The time in minutes after midnight at which the transition
  #    happens.
  # 4. The letter for the new state (often 'S' for standard, or 'D'
  #    for daylight).
  # 5. The number of years the rule covers (used to select which rules
  #    to eliminate when there are more than two rules in a year).

  # Update the base rule.
  my $rule_last_active_epoch = last_active_epoch($from, $to, $in, $on);
  if (defined $rule_last_active_epoch) {
    if (! defined $rule_base{$name} || $rule_base{$name}->[0] < $rule_last_active_epoch) {
      $rule_base{$name} = [ $rule_last_active_epoch, [ undef, $save_minutes, 0, $letters, SPECIFICITY_MAX ] ];
    }
  }

  my $transform_rule = sub {
    my $rule;
    if ($on =~ m/^(\w+)>=(\d+)$/) {
      # If start day is not a multiple of a week, round it to nearest week.
      $rule = [ int(($2-1+int(DPW/2))/DPW) + 1, lc($1), lc($in) ]; # e.g. [ '2', sun', 'nov' ]
      if (($2-1) % DPW) {
	warn "on $on fuzz for $name -> $rule->[0]";
	push(@problem, "Rounded $in $on to [@$rule] in Rule $name.");
      }
    } elsif ($on =~ m/^last(\w+)$/) {
      $rule = [ 'last', lc($1), lc($in) ]; # e.g. [ 'last', 'sun', 'apr' ]
    } else {
      warn "no match for $on in rule $l";
      push(@problem, "Ignored $in $on in Rule $name.");
    }
    my $at_minutes = $at;
    if ($at =~ m/^([\+\-]?)(?:(\d+)\:)(\d+)([wsguz]?)/) {
      $at_minutes = $2 * MPH + $3; my $z = $4;
      if ($1 eq '-') { $at_minutes = -$at_minutes; }
      if ($z eq '' or $z eq 'w') {
	# Change specified at local (pre-change) wall-clock time.
	$at_minutes .= 'W';
      } elsif ($z eq 's') {
	# Change specified at local standard time.
	$at_minutes .= 'S';
      } elsif ($z eq 'g' or $z eq 'u' or $z eq 'z') {
	# Change specified at UTC.
	$at_minutes .= 'Z';
      }
    }
    my $specificity =
      $to eq 'only' ? 1 :
      $to eq 'max'  ? SPECIFICITY_MAX :
	              $to - $from + 1;

    return [ $rule, $save_minutes, $at_minutes, $letters, $specificity ];
  };

  # We ignore any rule that has a definite to (end) time.
  if (($to eq 'only' && $from == $current_year) ||
      ($to ne 'only' && ($to eq 'max' || $to >= $current_year) && $from <= $current_year)) {
    push(@{$rule{$name}}, $transform_rule->());
  }
}

# Process a Zone line and updates the %output hash.
sub zone_line {
  my ($l) = @_;
  my ($ZONE, $name, $gmtoff, $rules, $format, $until) = split(m/\s+/, $l, 6);
  # We ignore any zone line that has a definite until (end) time that
  # is in the past.
  if (defined $until) {
    my ($until_year, $until_month, $until_day) = split_ymd($until);
    if (($until_year > $current_year) ||
        (($until_year == $current_year) &&
         (($until_month > $current_month) ||
          (($until_month == $current_month) &&
           ($until_day >= $current_day))))) {
      "future until \"$until\" not handled";
    }
    return;
  }

  # Set $name1 and $rule1 for (start of) standard time.  If zone has
  # DST, set name2 and $rule2 for (start of ) DST, otherwise it is
  # equal to standard time.
  my ($name1, $name2, $rule1, $rule2);
  my @rules = @{$rule{$rules} || []};

  # If we have more than two rules, discard all except for the two
  # most-specific rules.
  if (scalar(@rules) > 2) {
    print STDERR "discarding excess rules from $rules for $name\n", Data::Dump::dump(\@rules), "\n";
    push(@problem, "Discarded excess rules for Zone $name.");
    @rules = sort { $b->[RULE_SPECIFICITY] <=> $a->[RULE_SPECIFICITY] } @rules;
    $#rules = 1;
  }

  if (scalar(@rules) == 0) {
    # No active rules, use base rule.
    my $rule0 = $rule_base{$rules}->[1];
    $name1 = zonename($format, $rule0->[RULE_LETTERS], undef);
    $name2 = undef;
    $rule1 = $rule2 = RULE_NULL;
  } elsif (scalar(@rules) == 1) {
    # One active rule.  This is a year that DST started or stopped
    # being observed.  erlang_localtime doesn't handle this.  If DST
    # stopped being observed in this year, don't output a DST rule.
    # If DST started being observed, do.  (Except that we don't handle
    # this yet.)
    print STDERR Data::Dump::dump(\@rules), "\n";
    die "one rule for $name";
    $name1 = zonename($format, $rules[0]->[RULE_LETTERS], undef);
    $name2 = undef;
    $rule1 = $rule2 = RULE_NULL;
  } else { # 2 rules
    if ($rules[0]->[RULE_SAVED] != 0) { @rules = reverse(@rules); }; # Standard time first.
    $rule1 = $rules[0];
    $rule2 = $rules[1];
    $name1 = zonename($format, $rule1->[RULE_LETTERS], 0);
    $name2 = zonename($format, $rule2->[RULE_LETTERS], 1);
  }

  my $gmtoff_min = offset_minutes($gmtoff);

  # https://github.com/dmitryme/erlang_localtime/blob/master/include/tz_database.hrl
  # Documentation for output format.
  #
  # {TimeZoneName, {StdAbbr, StdName}, {DstAbbr, DstName}, StdMin, DstMin, DstStartDay, DstStartTime, DstEndDay, DstEndTime}
  #   TimeZoneName = String(), TimeZone name, MUST be unique. It is a key
  #   StdName = {String(), String()}, name and abbreviations of timezone before daylight shift
  #   DstName = {String(), String()}, name and abbreviations of timezone after daylight shift
  #   StdMin = Integer(), GMT offset in minutes. W/o daylight savings
  #   DstMin = Integer(), daylight saving. Adjustment for GMT offset, when daylight saving is on
  #   DstStartDay = {NthWeekday, Weekday, Month}, daylight saving transition rule. Can be undef if no daylight saving rule specified
  #   	NthWeekday = Integer(), 1(first week),2(second week),3(...),4(...),5(...),-1(last week)
  #   	Weekday = atom(), sun,mon,tue,wed,thu,fri,sat
  #   	Month = atom(), jan,deb,mar,apr,may,jun,jul,aug,sep,oct,mov,dec
  #   DstStartTime = {Hour, Min} - time of daylight saving transition
  #   	Hour = Integer(), [0..23]
  #   	Min = Integer(), [0..59]
  #   DstEndDay = {NthWeekday, Weekday, Month} - transition back to std. Can be undef if no daylight saving rule specified
  #   {Hour, Min} - time of transition to std.
  #
  # Example output.
  #       {"America/Los Angeles",{"PST","Pacific Standard Time"},{"PDT","Pacific Daylight Time"},-480,60,{2,sun,mar},{2,0},{1,sun,nov},{2,0}},
  #       {"America/Puerto Rico",{"AST","AST"},undef,-240,0,undef,{0,0},undef,{0,0}},

  $output{$name} = build_term(
    '#ZONE#',						# TimeZoneName
    [ $name1, $name1 ],					# StdAbbr, StdName
    defined $name2 ? [ $name2, $name2 ] : undef,	# DstAbbr, DstName
    $gmtoff_min,					# StdMin
    $rule2->[RULE_SAVED],				# DstMin
    $rule2->[RULE_DATE],				# DstStartDay,
    rulestart($rule2->[RULE_TIME], $gmtoff_min, 0, $name),		# DstStartTime
    $rule1->[RULE_DATE],				# DstEndDay
    rulestart($rule1->[RULE_TIME], $gmtoff_min, $rule2->[RULE_SAVED], $name), # DstEndTime
  );
}

sub zonename {
  my ($format, $letter, $dst) = @_;
  if (defined($dst) && $format =~ m,/,) {
    my @format = split(m,/,, $format);
    return $format[$dst];
  } else {
    return sprintf($format, $letter eq '-' ? '' : $letter);
  }
}

# erlang_localtime wants the transition time in the local wallclock
# time before the transition.
sub rulestart {
  my ($start, $gmtoff, $save, $name) = @_;
  $start =~ m/^(\d+)(\w)$/ or die "no match for $start";
  ($start, my $modified) = ($1, $2);

  if ($modified eq 'W') {
    # No adjustment needed.
  } elsif ($modified eq 'Z') {
    $start += $gmtoff + $save;

    # If adding a (negative) GMT offset puts the time in the previous
    # day, move it up to the start of the day.  If adding a (positive)
    # GMT offset puts the time in the next day, move it back to the
    # end of the day.  I think that this is wrong, and that actually
    # the transition should be moved into the previous or next day,
    # but it's not possible to consistently handle these rules with
    # "third Sunday in May" logic.  (For example, the day before the
    # third Sunday in May may be either the second or third Saturday
    # in May.)  Having the transition off by a few hours every year is
    # better than having it be off by a week every seven years or so.
    if ($start < 0) {
      warn "moving rule to beginning of day for $name";
      push(@problem, "Moving rule to beginning of day for Zone $name.");
      $start = 0;
    } elsif ($start > HPD * MPH) {
      warn "moving rule to end of day for $name";
      push(@problem, "Moving rule to end of day for Zone $name.");
      $start = HPD * MPH;
    }
  } elsif ($modified eq 'S') {
    # To handle this properly, we would have to know the prevailing
    # wall-clock offset from standard time.  In the general case this
    # could be different from $save, but in practice all of the
    # transitions are from standard time to daylight time or vice
    # versa and not between two different offsets from standard time,
    # and so this works.
    $start += $save;
  }

  return [ int($start / MPH), $start % MPH ];
}

sub build_term {
  my @term;
  foreach my $e (@_) {
    if (! defined $e) {
      push(@term, 'undef');
    } elsif (ref($e) eq 'ARRAY') {
      push(@term, build_term(@$e));
    } elsif ($e =~ m/^(\-?\d+|[a-z].*)$/) {
      push(@term, $e);			# number or term
    } else {
      push(@term, "\"$e\"");		# string
    }
  }

  return '{' . join(',', @term) . '}';
}

sub link_line {
  my ($l) = @_;
  my ($LINK, $canon_name, $old_name) = split(m/\s+/, $l);
  if (! defined $output{$canon_name}) {
    print STDERR "no zone $canon_name for link $old_name\n";
  } else {
    $output{$old_name} = $output{$canon_name};
  }
}

my (@mon_to_name, %mon_from_name, @dow_to_name, %dow_from_name);
INIT {
  @mon_to_name = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
  %mon_from_name = map { $mon_to_name[$_] => $_ } (0..$#mon_to_name);
  @dow_to_name = qw(Sun Mon Tue Wed Thu Fri Sat);
  %dow_from_name= map { $dow_to_name[$_] => $_ } (0..$#dow_to_name);
}

sub split_ymd {
  my ($ymd) = @_;
  $ymd =~ m/^(\d+)(?:\s+(\w+)(?:\s+(\d+)))?/
    or die "parse \"$ymd\" for ymd failed";
  my $year = $1;
  my $month = do {
    if (defined $2) {
      defined $mon_from_name{$2} or die "parse \"$ymd\" for month failed";
      $mon_from_name{$2};
    } else {
      0
    }
  };
  my $day = defined $3 ? $3 : 0;
  return ($year, $month, $day);
}

sub on_to_day_of_month {
  my ($on, $year, $month) = @_;

  my $day;
  if ($on =~ m/^\d+$/) {
    $day = $on;
	print "Day extracted via regex to: $day\n";
  } else {
    my ($desired_dow, $time_base);
    if ($on =~ m/^(\w+)>=(\d+)$/) {
      $desired_dow = $dow_from_name{$1}; my $desired_day = $2;
      $time_base = timelocal(0, 0, 0, $desired_day, $month, $year);
	  print "Regex: $on => desired_dow: $desired_dow. Time_base: $time_base\n";
    } elsif ($on =~ m/^(\w+)<=(\d+)$/) {
	  $desired_dow = $dow_from_name{$1}; my $desired_day = $2;
	  $time_base = timelocal(0, 0, 0, $desired_day, $month, $year);
    } elsif ($on =~ m/^last(\w+)$/) {
      $desired_dow = $dow_from_name{$1};
      # One week before the beginning of the next month.
      $time_base = timelocal_nocheck(0, 0, 0, - DPW, $month + 1, $year);
    } else {
      die "match $on failed";
    }
    ($day, my $dow) = (localtime($time_base))[3,6];
	print "Current Day: $day\n";
	print "Comparing $dow = $desired_dow\n";
    if ($dow != $desired_dow) {
		$day += (DPW + $desired_dow - $dow) % DPW;
	}
	print "After comparison: Current Day: $day\n";
  }

  return $day;
}

# Returns the epoch that the rule was last active, or undef if the
# rule has never been active (i.e., it begins in the future).
sub last_active_epoch {
  my ($from, $to, $in, $on) = @_;
  # $from is a year.
  # $to is a year, or 'only', or 'max'.
  # $in is a month name (e.g., 'Jan').
  # $on is a day-of-month, or a day-of-week>=day-of-month, or 'last'day-of-week.

  my $month = $mon_from_name{$in};

  # First check the rule's from time; if in the future return undef.
  if ($from > $current_year) { return undef; }
  if ($from == $current_year && $month > $current_month) { return undef; }
  if ($from == $current_year && $month == $current_month) {
    my $day = on_to_day_of_month($on, $current_year, $current_month);
    if ($day > $current_day) { return undef; }
  }

  # Now check the rule's to time.  If the rule covers the current
  # year, but doesn't fire until later in the year, subtract a year.
  my $year = $to eq 'max' ? $current_year : $to eq 'only' ? $from : $to;
 event_fires_later_this_year: {
    if ($year < $current_year) { last; }
    if ($year == $current_year && $month < $current_month) { last; }
    my $day = on_to_day_of_month($on, $year, $month);
    if ($year == $current_year && $month == $current_month && $day < $current_day) { last; }
    # If we get here, this year's rule instance is not active until
    # later in the year, so subtract a year.
    $year -= 1;
  }

  my $day = on_to_day_of_month($on, $year, $month);
  print "On=$on, Year=$year, Month=$month ====> Day=$day\n";
  ## changing the below call to timelocal_nocheck stops the crash, but it feels wrong
  ## especially since without it, it seems to produce a lot of "redundant" error messages
  ## I don't have time to explore this now, but I'll come back to it
  return timelocal(0, 0, 0, $day, $month, $year);
}

